Final Project: Phase 2 - EDA¶

Spring 2024
Group: Michael Massone and Joseph Nelson Farrell
DS 5230 Unsupervised Machine Learning
Professor Steven Morin, PhD
Due: 03/11/2024


Libraries¶

In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import os
import sys
from pathlib import Path
from sklearn.preprocessing import LabelEncoder

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

import warnings

Define File Paths¶

In [ ]:
# define path
# assumes the kernel's working directory is the notebooks folder, one level below the
# project root, so the parent of the cwd is treated as the project root
nb_path = Path.cwd()
print(nb_path)
path = str(nb_path.parent)
print(path)

# the folder paths below are kept as plain strings on purpose: later cells build
# file names from them with string concatenation

# path to figs folder
figs_path = path + '/figs'

# path to data
data_path = path + '/data'

# path to src folder
src_path = path + '/src'
print(src_path)

# sys path
# make the project's src folder importable; the membership check keeps the cell
# idempotent so re-running it does not stack duplicate entries on sys.path
if src_path not in sys.path:
    sys.path.append(src_path)
/Users/nelsonfarrell/Documents/Northeastern/5230/final_project/DS5230-final/notebooks
/Users/nelsonfarrell/Documents/Northeastern/5230/final_project/DS5230-final
/Users/nelsonfarrell/Documents/Northeastern/5230/final_project/DS5230-final/src

Functions¶

In [ ]:
from preprocessing_eda_utils import generate_column_hist
from preprocessing_eda_utils import sub_divide_pairplot

Parameters¶

In [ ]:
# transformed data csv file name
# (location relative to the data folder; the leading "/" is required because this
# string is appended to `data_path` by plain string concatenation when the file is read)
data_file = "/curated/trans_data_design.csv"

Load Data¶

In [ ]:
# build the full csv location, then load the design matrix into a dataframe
design_csv = data_path + data_file
trans_df = pd.read_csv(design_csv)

Exploratory Data Analysis¶

In [ ]:
# collect the design-matrix column labels (kept as a pandas Index, not a python list)
attr_list = trans_df.columns

# print a 1-based numbered listing of the attributes
print('Attributes List:')
for position, attr_name in enumerate(attr_list, start = 1):
    print(f'{position}: {attr_name}')
Attributes List:
1: numerical__Area
2: numerical__Perimeter
3: numerical__MajorAxisLength
4: numerical__MinorAxisLength
5: numerical__AspectRation
6: numerical__Eccentricity
7: numerical__ConvexArea
8: numerical__EquivDiameter
9: numerical__Extent
10: numerical__Solidity
11: numerical__roundness
12: numerical__Compactness
13: numerical__ShapeFactor1
14: numerical__ShapeFactor2
15: numerical__ShapeFactor3
16: numerical__ShapeFactor4
In [ ]:
# report the (rows, columns) shape of the frame restricted to the attribute columns
attr_frame_shape = trans_df[attr_list].shape
print('Transformed Dataframe Dimensions:', attr_frame_shape)
Transformed Dataframe Dimensions: (13611, 16)
In [ ]:
# render the first five rows for a quick visual check of the transformed values
first_rows = trans_df.head(n = 5)
display(first_rows)
numerical__Area numerical__Perimeter numerical__MajorAxisLength numerical__MinorAxisLength numerical__AspectRation numerical__Eccentricity numerical__ConvexArea numerical__EquivDiameter numerical__Extent numerical__Solidity numerical__roundness numerical__Compactness numerical__ShapeFactor1 numerical__ShapeFactor2 numerical__ShapeFactor3 numerical__ShapeFactor4
0 -0.840749 -1.143319 -1.306598 -0.631153 -1.565053 -2.185720 -0.841451 -1.063341 0.289087 0.367613 1.423867 1.839116 0.680786 2.402173 1.925723 0.838371
1 -0.829188 -1.013924 -1.395911 -0.434445 -1.969784 -3.686040 -0.826102 -1.044217 0.697477 -0.462907 0.231054 2.495449 0.367967 3.100893 2.689702 0.771138
2 -0.807157 -1.078829 -1.252357 -0.585735 -1.514291 -2.045336 -0.808704 -1.008084 0.578195 0.518417 1.252865 1.764843 0.603129 2.235091 1.841356 0.916755
3 -0.785741 -0.977215 -1.278825 -0.439290 -1.741618 -2.742211 -0.773975 -0.973337 0.671260 -2.241767 0.515049 2.081715 0.401718 2.515075 2.204250 -0.197985
4 -0.781239 -1.097384 -1.380471 -0.266663 -2.117993 -4.535028 -0.784286 -0.966080 0.476020 0.804772 1.874992 2.765330 0.118268 3.270983 3.013462 0.939640
In [ ]:
# display transformed df general information
# (prints the index range, each column's non-null count and dtype, and the memory usage)
trans_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13611 entries, 0 to 13610
Data columns (total 16 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   numerical__Area             13611 non-null  float64
 1   numerical__Perimeter        13611 non-null  float64
 2   numerical__MajorAxisLength  13611 non-null  float64
 3   numerical__MinorAxisLength  13611 non-null  float64
 4   numerical__AspectRation     13611 non-null  float64
 5   numerical__Eccentricity     13611 non-null  float64
 6   numerical__ConvexArea       13611 non-null  float64
 7   numerical__EquivDiameter    13611 non-null  float64
 8   numerical__Extent           13611 non-null  float64
 9   numerical__Solidity         13611 non-null  float64
 10  numerical__roundness        13611 non-null  float64
 11  numerical__Compactness      13611 non-null  float64
 12  numerical__ShapeFactor1     13611 non-null  float64
 13  numerical__ShapeFactor2     13611 non-null  float64
 14  numerical__ShapeFactor3     13611 non-null  float64
 15  numerical__ShapeFactor4     13611 non-null  float64
dtypes: float64(16)
memory usage: 1.7 MB
In [ ]:
# tally the missing entries (None or np.nan) found in each column
na_counts = trans_df.isna().sum()
print('\nNA (np.nan or None) Count:\n', na_counts, sep = '')
NA (np.nan or None) Count:
numerical__Area               0
numerical__Perimeter          0
numerical__MajorAxisLength    0
numerical__MinorAxisLength    0
numerical__AspectRation       0
numerical__Eccentricity       0
numerical__ConvexArea         0
numerical__EquivDiameter      0
numerical__Extent             0
numerical__Solidity           0
numerical__roundness          0
numerical__Compactness        0
numerical__ShapeFactor1       0
numerical__ShapeFactor2       0
numerical__ShapeFactor3       0
numerical__ShapeFactor4       0
dtype: int64
In [ ]:
# express each column's missing tally as a fraction of the total row count
n_rows = len(trans_df)
na_ratio = trans_df.isna().sum() / n_rows
print('\nNA (np.nan or None) Ratio:\n', na_ratio, sep = '')
NA (np.nan or None) Ratio:
numerical__Area               0.0
numerical__Perimeter          0.0
numerical__MajorAxisLength    0.0
numerical__MinorAxisLength    0.0
numerical__AspectRation       0.0
numerical__Eccentricity       0.0
numerical__ConvexArea         0.0
numerical__EquivDiameter      0.0
numerical__Extent             0.0
numerical__Solidity           0.0
numerical__roundness          0.0
numerical__Compactness        0.0
numerical__ShapeFactor1       0.0
numerical__ShapeFactor2       0.0
numerical__ShapeFactor3       0.0
numerical__ShapeFactor4       0.0
dtype: float64

Pairplots¶

In [ ]:
# ignore warnings
# NOTE: this filter is installed globally, so FutureWarnings stay silenced for the
# cells that run after this one as well
warnings.filterwarnings('ignore', category = FutureWarning)

# make sure the output folder exists; savefig does not create missing directories
os.makedirs(figs_path, exist_ok = True)

# generate plot: one seaborn pairplot grid over the columns of trans_df
plot = sns.pairplot(data = trans_df)

# work on the grid's own figure instead of pyplot's implicit "current figure";
# `.figure` is the supported accessor (`.fig` is documented as deprecated)
pairplot_fig = plot.figure
pairplot_fig.suptitle('Attribute Pairplots', fontsize = 60, weight = 'bold', style = "italic", y = 1.03)
pairplot_fig.tight_layout()

# save fig
pairplot_fig.savefig(figs_path + "/pairplot_full.png", bbox_inches = 'tight')
No description has been provided for this image
In [ ]:
# project helper imported from preprocessing_eda_utils, called on the full transformed frame
# NOTE(review): presumably redraws the pairplot in smaller attribute subsets so the panels
# are legible -- the helper's source is not part of this notebook; confirm there
sub_divide_pairplot(trans_df)
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Histograms of Numerical Columns¶

In [ ]:
# number of attributes handed to the histogram helper per figure
cols_per_fig = 4

# make sure the output folder exists; savefig does not create missing directories
os.makedirs(figs_path, exist_ok = True)

# divide numeric columns into groups of `cols_per_fig` for plotting
# (the last group is simply shorter when the column count is not a multiple of the group size)
divided_columns = [attr_list[i:i + cols_per_fig] for i in range(0, len(attr_list), cols_per_fig)]

# generate plots
for i, cols in enumerate(divided_columns):
    generate_column_hist(trans_df, cols)

    # save fig
    # NOTE(review): plt.savefig writes pyplot's *current* figure, so this assumes
    # generate_column_hist leaves the figure it drew open and current -- confirm in
    # preprocessing_eda_utils
    plt.savefig(figs_path + f"/attribute_hist_{i}.png", bbox_inches = 'tight')
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image